Stats_506_PS5

Author

Annie Yannan Niu

Problem set 5

Problem 1. OOP

a. Define the rational class

Function for GCD

library(Rcpp)

cppFunction('
int cpp_gcd(int a, int b) {
    while (b != 0) {
        int temp = b;
        b = a % b;
        a = temp;
    }
    return abs(a);
}
')
cpp_gcd(20, 25)
[1] 5

Function for LCM

cppFunction('
int cpp_lcm(int a, int b) {
    int x = a;
    int y = b;
    while (y != 0) {
            int temp = y;
            y = x % y;
            x = temp;
        }
    return abs(a * b) / abs(x);
}
')

# Test the functions
cpp_lcm(20, 25)   # Should return 100
[1] 100
# Define the rational S4 class
setClass("rational",
         slots = list(numerator = "numeric",
                      denominator = "numeric"))

# Constructor 
rational <- function(numerator, denominator = 1) {
    # if (denominator == 0) {
    #     stop("Denominator cannot be zero.")
    # }
    obj <- new("rational", numerator = numerator, denominator = denominator)
    validObject(obj)  # Check validity
    return(obj)
}

# Validator 
setValidity("rational", function(object) {
    if (object@denominator == 0) {
        return("Denominator cannot be zero.")
    }
    TRUE
})
Class "rational" [in ".GlobalEnv"]

Slots:
                              
Name:    numerator denominator
Class:     numeric     numeric
# Show method 
setMethod("show", "rational", function(object) {
    cat(object@numerator, "/", object@denominator, "\n")
})

# Simplify method 
setGeneric("simplify", function(object) standardGeneric("simplify"))
[1] "simplify"
setMethod("simplify", "rational", function(object) {
    a <- abs(object@numerator)
    b <- abs(object@denominator)
    gcd <- cpp_gcd(a, b)
    
    # Simplify by devide gcd
    new("rational", numerator = object@numerator / gcd, denominator = object@denominator / gcd)
})

# Quotient method 
setGeneric("quotient", function(object, digit = 2) standardGeneric("quotient"))
[1] "quotient"
setMethod("quotient", "rational", function(object, digit = 2) {
    if (digit %% 1 != 0) {
        stop("Digits must be a non-negative number.")
    }
    result <- object@numerator / object@denominator
    format_string <- paste0("%.", digit, "f")
    round_r <- sprintf(format_string,round(result, digit))
    print(round_r)
    return(result)
})

# Arithmetic methods 
setMethod("+", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
    lcm_denom <- cpp_lcm(e1@denominator, e2@denominator)
    new_numerator <- e1@numerator * (lcm_denom / e1@denominator) + e2@numerator * (lcm_denom / e2@denominator)
    simplify(new("rational", numerator = new_numerator, denominator = lcm_denom))
})

setMethod("-", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
    lcm_denom <- cpp_lcm(e1@denominator, e2@denominator)
    new_numerator <- e1@numerator * (lcm_denom / e1@denominator) - e2@numerator * (lcm_denom / e2@denominator)
    simplify(new("rational", numerator = new_numerator, denominator = lcm_denom))
})

setMethod("*", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
    simplify(new("rational", numerator = e1@numerator * e2@numerator, denominator = e1@denominator * e2@denominator))
})

setMethod("/", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
    if (e2@numerator == 0) stop("Cannot divide by zero.")
    simplify(new("rational", numerator = e1@numerator * e2@denominator, denominator = e1@denominator * e2@numerator))
})

b. Use your rational class to create three objects:

# Create instances of Rational numbers
r1 <- rational(24, 6)
r2 <- rational(7, 230)
r3 <- rational(0, 4)
print(c(r1, r2, r3))
[[1]]
24 / 6 

[[2]]
7 / 230 

[[3]]
0 / 4 
r1
24 / 6 
r3
0 / 4 
r1 + r2
927 / 230 
r1 - r2
913 / 230 
r1 * r2
14 / 115 
r1 / r2
920 / 7 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in r2/r3: Cannot divide by zero.
quotient(r1)
[1] "4.00"
[1] 4
quotient(r2)
[1] "0.03"
[1] 0.03043478
quotient(r2, digit = 3)
[1] "0.030"
[1] 0.03043478
quotient(r2, digit = 3.14)
Error in quotient(r2, digit = 3.14): Digits must be a non-negative number.
quotient(r2, digit = "avocado")
Error in digit%%1: non-numeric argument to binary operator
q2 <- quotient(r2, digit = 3)
[1] "0.030"
q2
[1] 0.03043478
quotient(r3)
[1] "0.00"
[1] 0
simplify(r1)
4 / 1 
simplify(r2)
7 / 230 
simplify(r3)
0 / 1 

c. Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.

r4 <- rational(24, 0)
Error in validObject(.Object): invalid class "rational" object: Denominator cannot be zero.

Problem 2 - plotly

a. Regenerate your plot which addresses the second question from last time:

Does the distribution of genre of sales across years appear to change?

# Import data
art_sales <- read.csv("/Users/nynn/Library/CloudStorage/OneDrive-Umich/Umich course/2024_Fall/Stats 506/Stats_506_PS/Stats_506_PS4/df_for_ml_improved_new_market.csv")

# Load necessary libraries
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
# Summarize data by year and genre, including NAs/no info as others
genre_summary <- art_sales %>%
  group_by(year) %>%
  summarise(
    Photography = sum(Genre___Photography, na.rm = TRUE),
    Print = sum(Genre___Print, na.rm = TRUE),
    Sculpture = sum(Genre___Sculpture, na.rm = TRUE),
    Painting = sum(Genre___Painting, na.rm = TRUE),
    Other = sum(Genre___Photography == 0 & Genre___Print == 0 &
                  Genre___Sculpture == 0 & Genre___Painting == 0) 
  )

# Reshape data for Plotly
genre_long <- genre_summary %>%
  pivot_longer(cols = c(Photography, Print, Painting, Sculpture, Other), 
               names_to = "Genre", 
               values_to = "Count") %>%
  group_by(year) %>%
  mutate(Percentage = round(Count / sum(Count) * 100, 1))

# Absolute number plot
absolute_plot <- plot_ly(genre_long, x = ~as.factor(year), y = ~Count, color = ~Genre, type = "bar") %>%
  layout(
    title = "Distribution of Genre Over Years - Absolute Number",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Number of Sales"),
    barmode = "stack",
    legend = list(title = list(text = "Genre"))
  )

absolute_plot
# Percentage plot (stacked to 100%)
percentage_plot <- plot_ly(genre_long, x = ~as.factor(year), y = ~Percentage, color = ~Genre, type = "bar", text = ~Genre) %>%
  layout(
    title = "Distribution of Genre Over Years - Percentage",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Percentage of Sales", ticksuffix = "%"),
    barmode = "stack",
    legend = list(title = list(text = "Genre"))
  )

percentage_plot

b. Generate an interactive plot with plotly that can address both of these questions from last time:

Is there a change in the sales price in USD over time?

How does the genre affect the change in sales price over time?

# Prepare data with median and IQR
df_yearly_price <- art_sales %>%
  group_by(year) %>%
  summarise(
    median_price_usd = median(price_usd, na.rm = TRUE),
    lower_q = quantile(price_usd, 0.25, na.rm = TRUE),  # 25th percentile
    upper_q = quantile(price_usd, 0.75, na.rm = TRUE)   # 75th percentile
  )

# Create the interactive Plotly plot
plot <- plot_ly(df_yearly_price, x = ~year) %>%
  # Add ribbon for IQR
  add_ribbons(ymin = ~lower_q, ymax = ~upper_q, name = "IQR (25th - 75th Percentile)", 
              fillcolor = 'rgba(173, 216, 230, 0.4)', line = list(color = 'transparent')) %>%
  # Add median line
  add_lines(y = ~median_price_usd, name = "Median Price", line = list(color = 'blue', width = 2)) %>%
  # Add median points
  add_markers(y = ~median_price_usd, name = "Median Price", marker = list(color = 'blue', size = 6)) %>%
  # Layout for titles and styling
  layout(
    title = "Sales Price Distribution Over Time with IQR",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Sales Price (USD)"),
    legend = list(title = list(text = "Legend")),
    hovermode = "x unified"  # Display hover info for all elements together
  )

plot
# Create a dataset for genre
art_sales_genre <- art_sales %>%
  mutate(Genre = case_when(
    Genre___Photography == 1 ~ "Photography",
    Genre___Print == 1 ~ "Print",
    Genre___Sculpture == 1 ~ "Sculpture",
    Genre___Painting == 1 ~ "Painting",
    TRUE ~ "Other"
  )) %>%
  select(year, price_usd, Genre)

art_sales_genre$Genre <- factor(art_sales_genre$Genre, levels = c("Painting", "Photography", "Print", "Sculpture", "Other"))

# Calculate IQR (Interquartile Range) for each genre and year
genre_price_summary <- art_sales_genre %>%
  group_by(year, Genre) %>%
  summarise(
    median_price = median(price_usd, na.rm = TRUE),
    lower_q = quantile(price_usd, 0.25, na.rm = TRUE),  # 25th percentile
    upper_q = quantile(price_usd, 0.75, na.rm = TRUE)   # 75th percentile
  )
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
# Generate a list of plotly plots for each genre with IQR ribbons and median lines
plots <- genre_price_summary %>%
  group_by(Genre) %>%
  do(plot = plot_ly(data = ., x = ~year) %>%
       add_ribbons(
         ymin = ~lower_q,
         ymax = ~upper_q,
         fillcolor = ~Genre,
         name = "IQR",
         showlegend = FALSE,
         opacity = 0.5
       ) %>%
       add_lines(
         y = ~median_price,
         color = ~Genre,
         name = ~Genre,
         line = list(width = 3),
         hoverinfo = "text",
         text = ~paste("Year:", year,
                       "<br>Median Price:", round(median_price, 2),
                       "<br>Genre:", Genre)
       ) %>%
       layout(
         title = list(text = ~unique(Genre), xref = "paper"),
         xaxis = list(title = "Year"),
         yaxis = list(title = "Median Sales Price (USD)")
       ))

# Use subplot to create a faceted plot with each genre in a separate facet, similar to ggplot's facet_wrap
final_plot <- subplot(plots$plot, nrows = 2, shareX = TRUE, titleX = TRUE)

# Display the final interactive plot with a shared legend at the top
final_plot %>%
  layout(
    title = "Sales Price Over Time by Genre with IQR",
    legend = list(orientation = "h", x = 0.5, xanchor = "center"),
    margin = list(t = 50)
  )

Problem 3. data.table

a.

# Load necessary libraries
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:dplyr':

    between, first, last
The following object is masked from 'package:purrr':

    transpose
library(nycflights13)

# Convert to data.table format
flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)

# Departure delays
departure_delays <- flights_dt[
  , .(mean_delay = mean(dep_delay, na.rm = TRUE),
      med_delay = median(dep_delay, na.rm = TRUE),
      numflights = .N), by = origin
][numflights >= 10][order(-mean_delay)]

# Join with airports data to get airport names
departure_delays <- departure_delays[
  airports_dt, on = .(origin = faa), nomatch = 0
][, .(name, mean_delay, med_delay)][order(-mean_delay)]

# Display the result for departure delays
print(departure_delays)
                  name mean_delay med_delay
                <char>      <num>     <num>
1: Newark Liberty Intl   15.10795        -1
2: John F Kennedy Intl   12.11216        -1
3:          La Guardia   10.34688        -3

b.

# Arrival delays
arrival_delays <- flights_dt[
  , .(mean_delay = mean(arr_delay, na.rm = TRUE),
      med_delay = median(arr_delay, na.rm = TRUE),
      numflights = .N), by = dest
][numflights >= 10][order(-mean_delay)]

# Join with airports data to get airport names
arrival_delays <- arrival_delays[
  airports_dt, on = .(dest = faa), nomatch = 0
][, name := coalesce(name, dest)][, .(name, mean_delay, med_delay)][order(-mean_delay)]

# Display all rows for arrival delays
print(arrival_delays)
                                    name   mean_delay med_delay
                                  <char>        <num>     <num>
 1:                Columbia Metropolitan  41.76415094      28.0
 2:                           Tulsa Intl  33.65986395      14.0
 3:                    Will Rogers World  30.61904762      16.0
 4:                 Jackson Hole Airport  28.09523810      15.0
 5:                        Mc Ghee Tyson  24.06920415       2.0
 6:               Dane Co Rgnl Truax Fld  20.19604317       1.0
 7:                        Richmond Intl  20.11125320       1.0
 8:        Akron Canton Regional Airport  19.69833729       3.0
 9:                      Des Moines Intl  19.00573614       0.0
10:                   Gerald R Ford Intl  18.18956044       1.0
11:                      Birmingham Intl  16.87732342      -2.0
12:         Theodore Francis Green State  16.23463687       1.0
13: Greenville-Spartanburg International  15.93544304      -0.5
14:    Cincinnati Northern Kentucky Intl  15.36456376      -3.0
15:            Savannah Hilton Head Intl  15.12950601      -1.0
16:          Manchester Regional Airport  14.78755365      -3.0
17:                          Eppley Afld  14.69889841      -2.0
18:                               Yeager  14.67164179      -1.5
19:                     Kansas City Intl  14.51405836       0.0
20:                          Albany Intl  14.39712919      -4.0
21:                General Mitchell Intl  14.16722038       0.0
22:                       Piedmont Triad  14.11260054      -2.0
23:               Washington Dulles Intl  13.86420212      -3.0
24:               Cherry Capital Airport  12.96842105     -10.0
25:              James M Cox Dayton Intl  12.68048606      -3.0
26:     Louisville International Airport  12.66938406      -2.0
27:                  Chicago Midway Intl  12.36422360      -1.0
28:                      Sacramento Intl  12.10992908       4.0
29:                    Jacksonville Intl  11.84483416      -2.0
30:                       Nashville Intl  11.81245891      -2.0
31:                Portland Intl Jetport  11.66040210      -4.0
32:               Greater Rochester Intl  11.56064461      -5.0
33:      Hartsfield Jackson Atlanta Intl  11.30011285      -1.0
34:                Lambert St Louis Intl  11.07846451      -3.0
35:                         Norfolk Intl  10.94909344      -4.0
36:            Baltimore Washington Intl  10.72673385      -5.0
37:                         Memphis Intl  10.64531435      -2.5
38:                   Port Columbus Intl  10.60132291      -3.0
39:                  Charleston Afb Intl  10.59296847      -4.0
40:                    Philadelphia Intl  10.12719014      -3.0
41:                  Raleigh Durham Intl  10.05238095      -3.0
42:                    Indianapolis Intl   9.94043412      -3.0
43:            Charlottesville-Albemarle   9.50000000      -5.0
44:               Cleveland Hopkins Intl   9.18161129      -5.0
45:        Ronald Reagan Washington Natl   9.06695204      -2.0
46:                      Burlington Intl   8.95099602      -4.0
47:                 Buffalo Niagara Intl   8.94595186      -5.0
48:                Syracuse Hancock Intl   8.90392501      -5.0
49:                          Denver Intl   8.60650021      -2.0
50:                      Palm Beach Intl   8.56297210      -3.0
51:                             Bob Hope   8.17567568      -3.0
52:       Fort Lauderdale Hollywood Intl   8.08212154      -3.0
53:                          Bangor Intl   8.02793296      -9.0
54:           Asheville Regional Airport   8.00383142      -1.0
55:                      Pittsburgh Intl   7.68099053      -5.0
56:                       Gallatin Field   7.60000000      -2.0
57:                 NW Arkansas Regional   7.46572581      -2.0
58:                           Tampa Intl   7.40852503      -4.0
59:               Charlotte Douglas Intl   7.36031885      -3.0
60:             Minneapolis St Paul Intl   7.27016886      -5.0
61:                      William P Hobby   7.17618819      -4.0
62:                         Bradley Intl   7.04854369     -10.0
63:                     San Antonio Intl   6.94537178      -9.0
64:                      South Bend Rgnl   6.50000000      -3.5
65:     Louis Armstrong New Orleans Intl   6.49017497      -6.0
66:                        Key West Intl   6.35294118       7.0
67:                        Eagle Co Rgnl   6.30434783      -4.0
68:                Austin Bergstrom Intl   6.01990875      -5.0
69:                   Chicago Ohare Intl   5.87661475      -8.0
70:                         Orlando Intl   5.45464309      -5.0
71:               Detroit Metro Wayne Co   5.42996346      -7.0
72:                        Portland Intl   5.14157973      -5.0
73:                        Nantucket Mem   4.85227273      -3.0
74:                      Wilmington Intl   4.63551402      -7.0
75:                    Myrtle Beach Intl   4.60344828     -13.0
76:    Albuquerque International Sunport   4.38188976      -5.5
77:         George Bush Intercontinental   4.24079040      -5.0
78:        Norman Y Mineta San Jose Intl   3.44817073      -7.0
79:               Southwest Florida Intl   3.23814963      -5.0
80:                       San Diego Intl   3.13916574      -5.0
81:              Sarasota Bradenton Intl   3.08243131      -5.0
82:            Metropolitan Oakland Intl   3.07766990      -9.0
83:   General Edward Lawrence Logan Intl   2.91439222      -9.0
84:                   San Francisco Intl   2.67289152      -8.0
85:                         Yampa Valley   2.14285714       2.0
86:              Phoenix Sky Harbor Intl   2.09704733      -6.0
87:            Montrose Regional Airport   1.78571429     -10.5
88:                     Los Angeles Intl   0.54711094      -7.0
89:               Dallas Fort Worth Intl   0.32212685      -9.0
90:                           Miami Intl   0.29905978      -9.0
91:                       Mc Carran Intl   0.25772849      -8.0
92:                  Salt Lake City Intl   0.17625459      -8.0
93:                           Long Beach  -0.06202723     -10.0
94:                Martha\\\\'s Vineyard  -0.28571429     -11.0
95:                  Seattle Tacoma Intl  -1.09909910     -11.0
96:                        Honolulu Intl  -1.36519258      -7.0
97:            John Wayne Arpt Orange Co  -7.86822660     -11.0
98:                    Palm Springs Intl -12.72222222     -13.5
                                    name   mean_delay med_delay

c. 

planes_dt <- as.data.table(planes)
fastest_aircraft <- flights_dt[
  planes_dt, 
  on = "tailnum", 
  .(model, mph = distance / (air_time / 60)),  # Calculate speed in join
  nomatch = NULL
][
  , .(avgmph = mean(mph, na.rm = TRUE), nflights = .N), by = model  # Aggregate by model
][order(-avgmph)][1]  # Sort by avgmph and take top row

print(fastest_aircraft)
     model   avgmph nflights
    <char>    <num>    <int>
1: 777-222 482.6254        4